Code
library(tidyverse)
library(here)
library(dplyr)
library(tidyr)
library(broom)
library(gt)
library(glue)
library(DT)
library(janitor)library(tidyverse)
library(here)
library(dplyr)
library(tidyr)
library(broom)
library(gt)
library(glue)
library(DT)
library(janitor)names <- read_csv(here("week 9", "lab 9", "StateNames_A.csv"))#data cleaning
cleaned_names <- names |>
rename(Sex = Gender)
datatable(cleaned_names)cleaned_names_table <- cleaned_names |>
filter(Name == "Allison") |>
group_by(State, Sex) |>
summarize(Count = sum(Count),
.groups = "drop") |>
pivot_wider(names_from = Sex,
values_from = Count,
values_fill = 0) |>
gt() |>
tab_header(
title = md("**Frequency Table of Babies Named Allison**"),
subtitle = "for 192,825 babies"
) |>
cols_label(
F = "Female",
M = "Male")
cleaned_names_table| Frequency Table of Babies Named Allison | ||
| for 192,825 babies | ||
| State | Female | Male |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
allison_f which contains only the babies assigned Female at birth.allison_f <- cleaned_names |>
filter(Name == "Allison",
Sex == "F")allison_totals <- allison_f |>
group_by(Year) |>
summarize(total = sum(Count))
ggplot(data = allison_totals,
mapping = aes(x = Year,
y = total)) +
geom_line() +
labs(title = "Popularity Total of the Name Allisons Over the years",
x = "Year",
y = "")Fit a linear model with the year as the explanatory variable, and the number of Allisons as the response. Similar to #3, each year should have one observation–the total number of Allisons born that year.
allison_lm <- lm(total ~ Year,
data = allison_totals)
allison_tidy_table <- broom::tidy(allison_lm) |>
gt() |>
tab_header(
title = md("**Regression Results Table**")) |>
tab_style(
style = list(
cell_text(font = "Times",
weight = "lighter",
size = "11px")),
locations = cells_body(columns = everything(),
rows = 1:2)
)
allison_tidy_table| Regression Results Table | ||||
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 209689.7609 | 42971.50497 | 4.879740 | 0.0001669256 |
| Year | -101.5191 | 21.42676 | -4.737959 | 0.0002228430 |
allison_residual <- augment(allison_lm,
newdata = allison_totals)
ggplot(data = allison_residual,
mapping = aes(x = Year,
y = .resid)) +
geom_point() +
labs(title = "Residuals of the Total Allisons over the Years",
x = "Year",
y = "")For the earlier years, residuals are higher, meaning that we may have under-predicted the popularity of the name Allison. For later years, residuals are lower, meaning that we may have over-predicted the popularity of the name Allison
I conclude that your name’s popularity has been decreasing from 1998 to 2005 and increased a bit from 2005 to 2009 and decreased again from 2009 to 2014. Your name is not as popular as it was in the earlier years.
Name_order <- c("Alan", "Allen", "Allan")
male_names_totals <- cleaned_names |>
filter(Name %in% c("Allen",
"Alan",
"Allan")) |>
group_by(Name, Year) |>
summarize(total = sum(Count)) |>
mutate(Name = factor(Name,
levels = Name_order))
ggplot(data = male_names_totals,
mapping = aes(x = Year,
y = total,
color = Name)) +
geom_line() +
labs(title = "Popularity Total of the Names: Allan, Alan, and Allen Over the Years",
x = "Year",
y = "")Allan_totals <- cleaned_names |>
filter(Year == 2000,
State %in% c("CA","PA"),
Name %in% c("Allan",
"Alan",
"Allen")) |>
group_by(State, Name) |>
summarize(total = sum(Count),
.groups = "drop") |>
pivot_wider(names_from = Name,
values_from = total,
values_fill = 0)
Allan_table <- gt(Allan_totals) |>
tab_header(
title = md("**Frequency of Allans**"),
subtitle = ("Year: 2000")) |>
tab_style(
style = list(
cell_borders(sides = "all",
color = "#87CEEB",
weight = px(1.5),
style = "solid"
),
cell_fill(color = "#FFC0CB", alpha = NULL),
cell_text(font = "Impact",
weight = "lighter",
size = "14px")),
locations = cells_body())
Allan_table| Frequency of Allans | |||
| Year: 2000 | |||
| State | Alan | Allan | Allen |
|---|---|---|---|
| CA | 584 | 131 | 176 |
| PA | 51 | 12 | 56 |
# I learned how to tab_style on https://gt.rstudio.com/reference/tab_style.html. I think that it can make your table more creative. 10.
Allan_total_percentage <- cleaned_names |>
filter(Year == 2000,
State %in% c("CA","PA"),
Name %in% c("Allan", "Alan", "Allen")) |>
group_by(State, Name) |>
summarize(total = sum(Count),
.groups = "drop") |>
mutate(prop = total / sum(total)) |>
group_by(State) |>
ungroup() |>
select(-total) |>
pivot_wider(names_from = Name,
values_from = prop,
values_fill = 0)
Allan_percent_table <- gt(Allan_total_percentage) |>
fmt_percent(columns = 2:4, decimals = 2) |>
tab_header(
title = md("**<span style = 'color:#BA55D3;'>Frequency of Allans</span>**"),
subtitle = ("Year: 2000")) |>
tab_style(
style = list(
cell_text(font = "Georgia",
weight = "bold",
size = "14px",
align = "right")),
locations = cells_body())
Allan_percent_table| Frequency of Allans | |||
| Year: 2000 | |||
| State | Alan | Allan | Allen |
|---|---|---|---|
| CA | 57.82% | 12.97% | 17.43% |
| PA | 5.05% | 1.19% | 5.54% |